import hazm as hz
from collections import defaultdict
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, recall_score, precision_score, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from difflib import SequenceMatcher
def similar(a, b):
    """Return the similarity ratio of sequences *a* and *b* in [0.0, 1.0].

    Thin wrapper around difflib.SequenceMatcher.ratio().
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
import matplotlib.pyplot as plt
train_ = pd.read_csv("train.csv")
def draw_dictionary(dic):
    """Render a flat {label: value} dict as a plotly bar chart, one bar per key."""
    one_row = pd.DataFrame(dic, index=[0])
    return px.bar(one_row.transpose())
Initialize a dictionary for keeping the counts of evidence words.
The count of each word in each category — chaining these together over all the words of a sentence — gives us a probability.
category_to_word_count = defaultdict(lambda: defaultdict(lambda: 0))
Remove highly occurring words that don't add to the prediction.
highly_occuring = pd.DataFrame(pd.DataFrame(category_to_word_count).mean().sort_values(ascending=False)).transpose()
Stemming finds the stem of a word (ریشه کلمه). Stemming alone is not enough because in some cases we have verbs; converting verbs to their base forms helps make verbs uniform for decision making. In the code below we have both stemmed and lemmatized our words to form a dictionary that serves as a probability database.
# hazm NLP helpers: text normalizer plus stemmer/lemmatizer used to collapse
# inflected word forms onto a single dictionary key.
normalizer = hz.Normalizer()
stemmer = hz.Stemmer()
lemitizer = hz.Lemmatizer()
# Stop substrings: a token containing any of these is discarded.
# Hoisted to module level (the original rebuilt the list on every call) and
# deduplicated (the original repeated "که", "در", "به" and "شده").
_USELESS_SUBSTRINGS = ("که", "از", "را", "شده", "در", "با", "هست", "شد", "این",
                       "به", "ه", "تا", "»", "«", "دارند", "همچنین", "میدهد",
                       "و", "است", "آن", ".", ",", "اس", "جه", "،", "#")


def f(r):
    """Return True when token *r* is worth keeping as evidence.

    A token is rejected when it contains any stop substring or is
    shorter than 3 characters.
    """
    if len(r) < 3:
        return False
    return not any(stop in r for stop in _USELESS_SUBSTRINGS)
def useless_words_filter(x):
    """Keep only the tokens of *x* that pass the stop-word predicate f()."""
    kept = []
    for token in x:
        if f(token):
            kept.append(token)
    return kept
# Normalize the raw text, tokenize it, then stem + lemmatize each token and
# drop stop words; the result is the per-row unigram list 'unary_tokens'.
train_['Normalized'] = train_['content'].map(lambda x: normalizer.normalize(x))
train_['unary_tokens'] = train_['Normalized'].map(
lambda x: hz.word_tokenize(x))
train_['unary_tokens'] = train_['unary_tokens'].map(
lambda x: useless_words_filter([lemitizer.lemmatize(stemmer.stem(y)) for y in x]))
# from wordcloud import WordCloud
# from wordcloud_fa import WordCloudFa
# for category in train_['label'].unique():
# text = "".join(i for i in train_.content)
# stopwords = set(hz.stopwords_list())
# wordcloud = WordCloudFa(stopwords=stopwords,
# background_color="white").generate(text)
# plt.figure(figsize=(15, 10))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis("off")
# plt.show()
Posterior probability -> the probability of a category being seen given that we see a word. Likelihood -> the likelihood of the words of a sentence being seen in a category; these can be chained (multiplied) because the words are assumed independent. Class prior probability -> starts off at 1/6; it is the base probability of a class being seen (with a uniform distribution over 6 categories it is 1/6). Evidence -> or predictor prior probability, is the chained probability so far with the likelihoods folded in: each time we see a word in a sentence, we multiply its probability of being seen in that category into the running prior probability of the sentence belonging to that category.
After doing the above for all categories and for all the words of a sentence, we take the argmax of the probabilities, which determines our label.
شیر آب بسته نبود / شیر جنگل سلطان دشت هاست — one might interpret شیر in both as either Lion or Tap, while the intended meanings are once Tap and once the animal. A bigram is enough here, but we can alter the sentence to make it harder and require N-grams; شیر در جنگل غرش می کند, for example, would require a 3-gram.
def update_dic_with_evidence(token, label):
    """Record one occurrence of *token* under category *label*."""
    per_label = category_to_word_count[token]
    per_label[label] = per_label[label] + 1
# Populate the evidence counts; apply() is used purely for its side effect,
# which is why the cell's visible output is a Series of None lists.
train_.apply(lambda x: list(map(lambda token: update_dic_with_evidence(
token, x['label']), x['unary_tokens'])), axis=1)
0 [None, None, None, None, None, None, None, Non...
1 [None, None, None, None, None, None, None, Non...
2 [None, None, None, None, None, None, None, Non...
3 [None, None, None, None, None, None, None, Non...
4 [None, None, None, None, None, None, None, Non...
...
8995 [None, None, None, None, None, None, None, Non...
8996 [None, None, None, None, None, None, None, Non...
8997 [None, None, None, None, None, None, None, Non...
8998 [None, None, None, None, None, None, None, Non...
8999 [None, None, None, None, None, None, None, Non...
Length: 9000, dtype: object
# Count matrix: rows = categories, columns = words (NaN where a word never
# occurred in a category).
category_word = pd.DataFrame(category_to_word_count)
category_word_t = category_word.transpose()
# Total token count per category — the likelihood denominator below.
grouped_sum = category_word.transpose().sum()
# Show the five most frequent words of every category.
for category in category_word_t.keys():
    top_words = category_word_t[category].sort_values(ascending=False).head(5)
    # Bug fix: the original assigned the figure to `plt`, shadowing the
    # matplotlib.pyplot module imported at the top of the file.
    fig = px.bar(top_words)
    fig.show()
def calculate_posterior(row, cat):
    """Chained (scaled) posterior of *row* belonging to category *cat*,
    smoothing unseen word/category pairs with 1 / vocabulary size."""
    posterior = (10 ** 300) / 6  # huge scale factor delays float underflow
    vocabulary_size = len(category_word.keys())
    for word in row['unary_tokens']:
        count = category_word[word][cat]
        if pd.isnull(count):
            posterior *= 1 / vocabulary_size
        else:
            posterior *= count * 100 / grouped_sum[cat]
    return posterior
def calculate_posterior_no_smoothing(row, cat):
    """Chained (scaled) posterior of *row* under category *cat*, without
    smoothing: any word never seen in *cat* zeroes the whole product.

    Starts from a huge constant (1e300 / 6) to delay float underflow in the
    long product of small likelihoods.
    """
    result = (10 ** 300) / 6
    for word in row['unary_tokens']:
        count = category_word[word][cat]
        if pd.isnull(count):
            # The product can never recover from a zero factor; bail out
            # instead of multiplying the remaining terms into 0 (the original
            # did `result *= 0` and kept looping — same value, wasted work).
            return 0.0
        result *= count / grouped_sum[cat]
    return result
# One posterior column per category (no smoothing), then predict by argmax
# over those columns.
for cat in grouped_sum.keys():
train_[
cat + '_posterior'] = train_.apply(lambda row: calculate_posterior_no_smoothing(row, cat), axis=1)
# NOTE(review): iloc[:, 4:] assumes the posterior columns start at index 4 and
# nothing follows them; the later cell uses iloc[:, 4:10] instead — confirm.
train_["prediction"] = train_.iloc[:, 4:].idxmax(
axis=1).map(lambda x: x.replace("_posterior", ""))
def calc_precision_for_label(label, df):
    """Precision for one class: correct predictions of *label* divided by
    all predictions of *label*."""
    predicted = df[df['prediction'] == label]
    true_positives = predicted[predicted['label'] == label]
    return len(true_positives) / len(predicted)
def calc_recall_for_label(label, df):
    """Recall for one class: correct predictions of *label* divided by all
    rows whose true label is *label*."""
    actual = df[df['label'] == label]
    true_positives = actual[actual['prediction'] == label]
    return len(true_positives) / len(actual)
def precision(df):
    """Per-class precision as a {label: precision} dict over every label in *df*."""
    return {lab: calc_precision_for_label(lab, df) for lab in df['label'].unique()}
def recall(df):
    """Per-class recall as a {label: recall} dict over every label in *df*."""
    return {lab: calc_recall_for_label(lab, df) for lab in df['label'].unique()}
def accuracy(truth, prediction):
    """Fraction of positions where *truth* equals *prediction*.

    Expects pandas Series (element-wise ==). Replaces the original
    len(list(filter(lambda ...))) chain with a direct boolean sum —
    same result, clearer and without the intermediate list.
    """
    return int((truth == prediction).sum()) / len(truth)
def F1(recalls, precisions):
    """Harmonic mean of recall and precision (element-wise on frames/Series).

    Bug fix: the original computed the expression but had no `return`,
    so it always returned None.
    """
    return (recalls * precisions) * 2 / (recalls + precisions)
# Confusion matrix and per-class metrics for the un-smoothed model on the
# training set.
cf = confusion_matrix(train_['label'], train_[
'prediction'], labels=list(train_['label'].unique()))
r = ConfusionMatrixDisplay(
confusion_matrix=cf, display_labels=list(train_['label'].unique()))
print("Accuracy " + str(accuracy(train_['label'], train_['prediction'])))
print("Precision")
draw_dictionary(precision(train_)).show()
print("Recall")
draw_dictionary(recall(train_)).show()
recalls = pd.DataFrame(recall(train_), index=[0])
precisions = pd.DataFrame(precision(train_), index=[0])
# NOTE(review): this rebinds the name F1 (also a function above) to a frame.
F1 = ((recalls * precisions) * 2)/(recalls + precisions)
print("F1")
draw_dictionary(F1).show()
r.plot(xticks_rotation=45)
Accuracy 0.8584444444444445 Precision
Recall
F1
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f5ec0381cc0>
# Re-score the training set with the smoothed posterior and re-evaluate.
for cat in grouped_sum.keys():
train_[
cat + '_posterior'] = train_.apply(lambda row: calculate_posterior(row, cat), axis=1)
# Columns 4:10 are assumed to be the six *_posterior columns — TODO confirm.
train_["prediction"] = train_.iloc[:, 4:10].idxmax(
axis=1).map(lambda x: x.replace("_posterior", ""))
cf = confusion_matrix(train_['label'], train_[
'prediction'], labels=list(train_['label'].unique()))
r = ConfusionMatrixDisplay(
confusion_matrix=cf, display_labels=list(train_['label'].unique()))
print("Accuracy " + str(accuracy(train_['label'], train_['prediction'])))
print("Precision")
draw_dictionary(precision(train_)).show()
print("Recall")
draw_dictionary(recall(train_)).show()
recalls = pd.DataFrame(recall(train_), index=[0])
precisions = pd.DataFrame(precision(train_), index=[0])
F1 = ((recalls * precisions) * 2)/(recalls + precisions)
print("F1")
draw_dictionary(F1).show()
r.plot(xticks_rotation=45)
Accuracy 0.9364444444444444 Precision
Recall
F1
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f5ec08af550>
# Load the held-out test set and apply the same normalize/tokenize/stem/
# lemmatize/filter pipeline used for training.
test_ = pd.read_csv("test.csv")
test_['Normalized'] = test_['content'].map(lambda x: normalizer.normalize(x))
test_['unary_tokens'] = test_['Normalized'].map(
lambda x: hz.word_tokenize(x))
test_['unary_tokens'] = test_['unary_tokens'].map(
lambda x: useless_words_filter([lemitizer.lemmatize(stemmer.stem(y)) for y in x]))
def test_calculate_posterior(row, cat):
    """Smoothed posterior for a test row under category *cat*; words absent
    from the training vocabulary are skipped entirely."""
    vocabulary = category_word.keys()
    vocabulary_size = len(vocabulary)
    posterior = (10 ** 300) / 6  # huge scale factor delays float underflow
    for word in row['unary_tokens']:
        if word not in vocabulary:
            continue
        count = category_word[word][cat]
        if pd.isnull(count):
            posterior *= 1 / vocabulary_size
        else:
            posterior *= count * 100 / grouped_sum[cat]
    return posterior
def test_calculate_posterior_no_smoothing(row, cat):
    """Un-smoothed posterior for a test row: words absent from the training
    vocabulary are skipped, and a word seen in training but never in *cat*
    zeroes the whole product.

    Bug fix: the original multiplied each likelihood by 100, unlike its
    training counterpart calculate_posterior_no_smoothing. Because the number
    of multiplied terms differs per category, that extra per-term factor could
    change the argmax; it is removed for consistency.
    """
    result = (10 ** 300) / 6
    for word in row['unary_tokens']:
        if word not in category_word:
            continue
        count = category_word[word][cat]
        if pd.isnull(count):
            return 0.0  # product can never recover from a zero factor
        result *= count / grouped_sum[cat]
    return result
def Weighted_F1(F1, df):
    """Support-weighted average of per-class F1 scores.

    *F1* is a one-row DataFrame keyed by class label; *df* supplies the
    class supports via its 'label' and 'content' columns.

    Bug fixes: the original ignored its *df* parameter and always read the
    global test_ frame, and it returned a one-element Series rather than a
    scalar (visible as "Weighted F10 ... dtype: float64" in the output).
    """
    weights = df.groupby('label').count()['content']
    weighted_total = sum(weights[label] * float(F1[label].iloc[0])
                         for label in F1.columns)
    return weighted_total / weights.sum()
def Macro_F1(F1):
    """Unweighted mean of the per-class F1 scores in the one-row frame *F1*."""
    scores = F1.values
    return scores.mean()
def Micro_averaged_F1(df):
    """Micro-averaged F1 of *df*'s predictions.

    In multi-class classification micro F1 — TP / (TP + (FP + FN) / 2) —
    reduces to plain accuracy, computed here directly (the accuracy()
    helper's logic inlined).
    """
    hits = df['label'] == df['prediction']
    return int(hits.sum()) / len(hits)
# Score and evaluate the test set with the un-smoothed posterior.
for cat in grouped_sum.keys():
test_[
cat + '_posterior'] = test_.apply(lambda row: test_calculate_posterior_no_smoothing(row, cat), axis=1)
# Columns 4:10 are assumed to be the six *_posterior columns — TODO confirm.
test_["prediction"] = test_.iloc[:, 4:10].idxmax(
axis=1).map(lambda x: x.replace("_posterior", ""))
cf = confusion_matrix(test_['label'], test_[
'prediction'], labels=list(test_['label'].unique()))
r = ConfusionMatrixDisplay(
confusion_matrix=cf, display_labels=list(test_['label'].unique()))
print("Accuracy " + str(accuracy(test_['label'], test_['prediction'])))
print("Precision")
draw_dictionary(precision(test_)).show()
print("Recall")
draw_dictionary(recall(test_)).show()
recalls = pd.DataFrame(recall(test_), index=[0])
precisions = pd.DataFrame(precision(test_), index=[0])
F1 = ((recalls * precisions) * 2)/(recalls + precisions)
print("F1")
draw_dictionary(F1).show()
r.plot(xticks_rotation=45)
Accuracy 0.6206278026905829 Precision
Recall
F1
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f5eb27e23e0>
# Aggregate F1 summaries for the un-smoothed test run.
print ("Weighted F1" + str(Weighted_F1(F1,test_)))
print("Micro Averaged F1 " + str(Micro_averaged_F1(test_)))
print("Macroaveraged F1" ,Macro_F1(F1))
Weighted F10 0.643279 dtype: float64 Micro Averaged F1 0.6206278026905829 Macroaveraged F1 0.6400315299435059
# Re-score and re-evaluate the test set with the smoothed posterior.
for cat in grouped_sum.keys():
test_[
cat + '_posterior'] = test_.apply(lambda row: test_calculate_posterior(row, cat), axis=1)
test_["prediction"] = test_.iloc[:, 4:10].idxmax(
axis=1).map(lambda x: x.replace("_posterior", ""))
cf = confusion_matrix(test_['label'], test_[
'prediction'], labels=list(test_['label'].unique()))
r = ConfusionMatrixDisplay(
confusion_matrix=cf, display_labels=list(test_['label'].unique()))
print("Accuracy " + str(accuracy(test_['label'], test_['prediction'])))
print("Precision")
draw_dictionary(precision(test_)).show()
print("Recall")
draw_dictionary(recall(test_)).show()
recalls = pd.DataFrame(recall(test_), index=[0])
precisions = pd.DataFrame(precision(test_), index=[0])
F1 = ((recalls * precisions) * 2)/(recalls + precisions)
print("F1")
draw_dictionary(F1).show()
r.plot(xticks_rotation=45)
Accuracy 0.905829596412556 Precision
Recall
F1
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f5e8d389f90>
# Aggregate F1 summaries for the smoothed test run.
print ("Weighted F1" + str(Weighted_F1(F1,test_)))
print("Micro Averaged F1 " + str(Micro_averaged_F1(test_)))
print("Macroaveraged F1" ,Macro_F1(F1))
Weighted F10 0.905921 dtype: float64 Micro Averaged F1 0.905829596412556 Macroaveraged F1 0.9058152832095395
Neither recall nor precision alone is enough. For example, in the Titanic dataset, if we let our model classify everyone as dead, we get 100 percent recall in the dead category because we have caught all the dead people, but 0 recall in the alive class; also, most predictions will look correct when the dataset's distribution is imbalanced. In that case precision shows the other side: that our false positives were numerous, which hurts the overall measure. Another example is an email system: a model that flags every email as spam will have 100 percent recall because it has caught all the spam, but its precision will be low because every legitimate email becomes a false positive.
according to https://towardsdatascience.com/the-f1-score-bec2bbc38aa6 F1 is using the harmonic mean of precision and recall for each category
... quoting A model will obtain a high F1 score if both Precision and Recall are high A model will obtain a low F1 score if both Precision and Recall are low A model will obtain a medium F1 score if one of Precision and Recall is low and the other is high which means F1 Is trying to represent a metric that combines both precision and recall and is mostly used when we don't know which one suits our requirements well in many situations like benchmarking or grid search for optimization one metric is easier to use
That research is reflected in how the metric functions above were written.
In Naive Bayes we multiply the prior probability by the current likelihood; therefore, if the current likelihood (an example is described in an earlier section) is zero, we classify that text as definitely not belonging to that class because we have never seen any evidence for it. This is problematic, so by assigning a small admissible probability to each unseen word in a category, we can avoid this issue.
Smoothing had the most effect on the technology class. Accuracy and micro F1 are the same because in multi-class classification the formula shrinks to total accuracy, per the classification-metrics link here -> https://towardsdatascience.com/multi-class-metrics-made-simple-part-ii-the-f1-score-ebe8b2c2ca1. Since categories that have the most unique words are easier to classify, we get the best performance on sports and health; technology is more prone to sparsity because it shares a lot of its words with other categories.